# Mode distributions are taken from ALL tours (not just matched ones) so the
# figures stay consistent with the distribution tables above.
# Every mode code 0-9 gets a slot, whether or not any tour uses it.
all_mode_codes = list(range(10))
mode_labels = [mode_names.get(i, f"Mode {i}") for i in all_mode_codes]

# Tours per mode code, one table per data set.
legacy_mode_counts = (
    legacy_tours.group_by("tmodetp")
    .agg(pl.len().alias("count"))
    .sort("tmodetp")
)
new_mode_counts = (
    new_tours.group_by("tmodetp")
    .agg(pl.len().alias("count"))
    .sort("tmodetp")
)

# Convert each count table into a {mode code: count} lookup once, so that
# every code below is a single dictionary access rather than a frame filter.
legacy_mode_lookup = dict(
    legacy_mode_counts.select(["tmodetp", "count"]).iter_rows()
)
new_mode_lookup = dict(
    new_mode_counts.select(["tmodetp", "count"]).iter_rows()
)

# Dense arrays aligned with all_mode_codes; codes with no tours count as 0.
legacy_by_mode = np.array(
    [legacy_mode_lookup.get(code, 0) for code in all_mode_codes]
)
new_by_mode = np.array(
    [new_mode_lookup.get(code, 0) for code in all_mode_codes]
)
# Purpose distributions over all tours. Legacy purpose codes are first
# translated through legacy_to_semantic; a code without a mapping keeps its
# original value. Every purpose code 0-9 gets a slot.
all_purpose_codes = list(range(10))
purpose_labels = [purpose_names.get(i, f"Purpose {i}") for i in all_purpose_codes]

legacy_tours_semantic = legacy_tours.with_columns(
    pl.col("pdpurp")
    .replace_strict(legacy_to_semantic, default=pl.col("pdpurp"))
    .alias("semantic_purp")
)

# Tours per purpose code: semantic codes for legacy, raw codes for new.
legacy_purpose_counts = (
    legacy_tours_semantic.group_by("semantic_purp")
    .agg(pl.len().alias("count"))
    .sort("semantic_purp")
)
new_purpose_counts = (
    new_tours.group_by("pdpurp")
    .agg(pl.len().alias("count"))
    .sort("pdpurp")
)

# Convert each count table into a {purpose code: count} lookup once, so that
# every code below is a single dictionary access rather than a frame filter.
legacy_purpose_lookup = dict(
    legacy_purpose_counts.select(["semantic_purp", "count"]).iter_rows()
)
new_purpose_lookup = dict(
    new_purpose_counts.select(["pdpurp", "count"]).iter_rows()
)

# Dense arrays aligned with all_purpose_codes; codes with no tours count as 0.
legacy_by_purpose = np.array(
    [legacy_purpose_lookup.get(code, 0) for code in all_purpose_codes]
)
new_by_purpose = np.array(
    [new_purpose_lookup.get(code, 0) for code in all_purpose_codes]
)
from scipy import stats
# Pearson correlation between the legacy and new count vectors, computed
# separately for modes and for purposes. Only the first element of each
# result (the correlation coefficient) is kept.
mode_correlation = stats.pearsonr(legacy_by_mode, new_by_mode)
purpose_correlation = stats.pearsonr(legacy_by_purpose, new_by_purpose)
r_mode = mode_correlation[0]
r_purpose = purpose_correlation[0]
# Side-by-side scatter plots (legacy count on x, new count on y): modes in
# the left panel, purposes in the right, with R² shown in each panel title.
fig_scatter = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        f'Mode Distribution Comparison<br><sub>R² = {r_mode**2:.4f}</sub>',
        f'Purpose Distribution Comparison<br><sub>R² = {r_purpose**2:.4f}</sub>',
    ),
    horizontal_spacing=0.12,
)

# End point of each panel's diagonal: the largest count on either axis.
max_mode = max(legacy_by_mode.max(), new_by_mode.max())
max_purpose = max(legacy_by_purpose.max(), new_by_purpose.max())

# Everything that differs between the two panels; the drawing code is shared.
scatter_panels = [
    dict(
        col=1,
        legacy=legacy_by_mode,
        new=new_by_mode,
        labels=mode_labels,
        color='steelblue',
        name='Modes',
        axis_max=max_mode,
        x_axis_id='x',
    ),
    dict(
        col=2,
        legacy=legacy_by_purpose,
        new=new_by_purpose,
        labels=purpose_labels,
        color='darkgreen',
        name='Purposes',
        axis_max=max_purpose,
        x_axis_id='x2',
    ),
]

for panel in scatter_panels:
    # One marker per code, labelled on hover.
    fig_scatter.add_trace(
        go.Scatter(
            x=panel['legacy'],
            y=panel['new'],
            mode='markers',
            marker=dict(size=12, opacity=0.7, color=panel['color']),
            text=panel['labels'],
            hovertemplate='<b>%{text}</b><br>Legacy: %{x:,.0f}<br>New: %{y:,.0f}<extra></extra>',
            name=panel['name'],
        ),
        row=1,
        col=panel['col'],
    )
    # Dashed y = x reference line: points on it have identical counts.
    fig_scatter.add_trace(
        go.Scatter(
            x=[0, panel['axis_max']],
            y=[0, panel['axis_max']],
            mode='lines',
            line=dict(color='red', dash='dash', width=1),
            name='Perfect Match',
            hoverinfo='skip',
            showlegend=False,
        ),
        row=1,
        col=panel['col'],
    )
    # Anchor y to the panel's own x axis at ratio 1 for a fixed aspect ratio.
    fig_scatter.update_xaxes(title_text='Legacy Tour Count', row=1, col=panel['col'])
    fig_scatter.update_yaxes(
        title_text='New Tour Count',
        scaleanchor=panel['x_axis_id'],
        scaleratio=1,
        row=1,
        col=panel['col'],
    )

fig_scatter.update_layout(
    height=500,
    showlegend=False,
    template='plotly_white',
    hovermode='closest',
    margin=dict(t=80, b=50, l=50, r=50),
)
fig_scatter.show()
# 2. Sankey Diagrams: Mode and Purpose Flow
# Matched tours show how each classification flows from legacy to new.

# Mode Sankey: number of matched tours per (legacy mode, new mode) pair.
mode_flow = (
    matched.group_by(['tmodetp_leg', 'tmodetp_new'])
    .agg(pl.len().alias('count'))
    .sort('count', descending=True)
)

# One label per entry of mode_names, in the dict's own order.
# NOTE: this rebinds mode_labels, which was built earlier from codes 0-9.
mode_labels = list(mode_names.values())
n_modes = len(mode_labels)

# Node layout: legacy modes take indices [0, n_modes), new modes take
# [n_modes, 2 * n_modes).
mode_node_labels = (
    [f"Legacy: {label}" for label in mode_labels]
    + [f"New: {label}" for label in mode_labels]
)

# Position of each mode code within mode_names, computed once so the loop
# does not rebuild and scan the key list for every flow row.
mode_node_index = {code: idx for idx, code in enumerate(mode_names)}

# Parallel link lists (source node, target node, tour count).
mode_sources = []
mode_targets = []
mode_values = []
for flow in mode_flow.iter_rows(named=True):
    legacy_idx = mode_node_index.get(flow['tmodetp_leg'])
    new_idx = mode_node_index.get(flow['tmodetp_new'])
    if legacy_idx is None or new_idx is None:
        # A code missing from mode_names has no node to attach to. Skip the
        # flow, as the purpose flow below does, instead of raising ValueError.
        continue
    mode_sources.append(legacy_idx)
    mode_targets.append(new_idx + n_modes)
    mode_values.append(flow['count'])
# Purpose Sankey. Legacy purpose codes are translated through
# legacy_to_semantic first; a code without a mapping keeps its original value.
matched_purpose = matched.with_columns(
    pl.col("pdpurp_leg")
    .replace_strict(legacy_to_semantic, default=pl.col("pdpurp_leg"))
    .alias("pdpurp_leg_semantic")
)

# Number of matched tours per (semantic legacy purpose, new purpose) pair.
purpose_flow = (
    matched_purpose.group_by(['pdpurp_leg_semantic', 'pdpurp_new'])
    .agg(pl.len().alias('count'))
    .sort('count', descending=True)
)

# Only purpose codes below 10 get a node, in ascending code order. The key
# list is built once and shared by the labels and the index map.
purpose_keys = sorted(k for k in purpose_names if k < 10)
purpose_labels_list = [purpose_names[k] for k in purpose_keys]
n_purposes = len(purpose_labels_list)

# Node layout: legacy purposes take indices [0, n_purposes), new purposes
# take [n_purposes, 2 * n_purposes).
purpose_node_labels = (
    [f"Legacy: {label}" for label in purpose_labels_list]
    + [f"New: {label}" for label in purpose_labels_list]
)

# Position of each purpose code within purpose_keys.
purpose_node_index = {code: idx for idx, code in enumerate(purpose_keys)}

# Parallel link lists (source node, target node, tour count).
purpose_sources = []
purpose_targets = []
purpose_values = []
for flow in purpose_flow.iter_rows(named=True):
    legacy_idx = purpose_node_index.get(flow['pdpurp_leg_semantic'])
    new_idx = purpose_node_index.get(flow['pdpurp_new'])
    if legacy_idx is None or new_idx is None:
        # Either side uses a code that has no node; leave the flow out.
        continue
    purpose_sources.append(legacy_idx)
    purpose_targets.append(new_idx + n_purposes)
    purpose_values.append(flow['count'])
# Two Sankey panels side by side: mode flow on the left, purpose flow on the
# right. Both panels share the same node and link styling.
fig_sankey = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Mode Classification Flow', 'Purpose Classification Flow'),
    specs=[[{"type": "sankey"}, {"type": "sankey"}]],
    horizontal_spacing=0.05,
)

# (subplot column, node labels, link sources, link targets, link values)
sankey_panels = [
    (1, mode_node_labels, mode_sources, mode_targets, mode_values),
    (2, purpose_node_labels, purpose_sources, purpose_targets, purpose_values),
]
for panel_col, node_labels, link_sources, link_targets, link_values in sankey_panels:
    fig_sankey.add_trace(
        go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="black", width=0.5),
                label=node_labels,
            ),
            link=dict(
                source=link_sources,
                target=link_targets,
                value=link_values,
            ),
        ),
        row=1,
        col=panel_col,
    )

fig_sankey.update_layout(
    height=600,
    margin=dict(t=80, b=50, l=50, r=50),
)
fig_sankey.show()
from IPython.display import Markdown, display

# Headline numbers: tour totals on each side and their signed difference
# (new minus legacy).
total_legacy = len(legacy_tours)
total_new = len(new_tours)
total_diff = total_new - total_legacy

# Markdown summary: totals with thousands separators, correlations to four
# decimal places alongside their squares.
summary_markdown = f"""
**Summary Statistics:**
- **Total Tours:** Legacy = {total_legacy:,} | New = {total_new:,} | Diff = {total_diff:+,}
- **Mode Correlation (R):** {r_mode:.4f} (R² = {r_mode**2:.4f})
- **Purpose Correlation (R):** {r_purpose:.4f} (R² = {r_purpose**2:.4f})
"""
display(Markdown(summary_markdown))